import glob
from pathlib import Path

import pandas as pd


def main(in_dir: "Path | str | None" = None) -> None:
    """Merge the Eurostat ``isoc_r_broad_h`` long extracts into two CSV files.

    Reads every ``broadband_region_year_eurostat_isoc_r_broad_h_long_*_*.csv``
    file in *in_dir*, normalizes and de-duplicates the rows, and writes two
    files back into the same directory:

    * ``broadband_region_year_eurostat_isoc_r_broad_h.csv`` - one row per
      (region, year) with one column per unit (``PC_HH`` and ``PC_HH_IACC``
      are renamed to ``broadband_pc_hh`` / ``internet_access_pc_hh``).
    * ``broadband_region_year.csv`` - the project template schema, with
      ``fixed_broadband_coverage_pct`` taken from ``broadband_pc_hh``.

    Args:
        in_dir: Directory holding the long files and receiving the outputs.
            Defaults to ``data_external`` one level above this file's
            directory.

    Raises:
        SystemExit: If no long files are found, all of them are empty, a
            required column is missing, or no usable rows remain after
            cleaning.
    """
    if in_dir is None:
        in_dir = Path(__file__).resolve().parents[1] / "data_external"
    else:
        in_dir = Path(in_dir)

    # Path.glob applies the pattern to file names only, so glob metacharacters
    # in the directory path (e.g. "proj[1]") cannot break the match.
    # key=str keeps the plain string ordering of the file paths.
    paths = sorted(
        in_dir.glob("broadband_region_year_eurostat_isoc_r_broad_h_long_*_*.csv"),
        key=str,
    )
    if not paths:
        raise SystemExit(f"No Eurostat long files found in {in_dir}")

    frames = []
    for path in paths:
        try:
            df = pd.read_csv(path)
        except pd.errors.EmptyDataError:
            # A zero-byte file has no header to parse; skip it like a
            # header-only file instead of aborting the whole run.
            continue
        if not df.empty:
            frames.append(df)

    if not frames:
        raise SystemExit("Eurostat long files are empty.")

    long = pd.concat(frames, ignore_index=True)

    required = ("region", "year", "unit", "value")
    missing = [col for col in required if col not in long.columns]
    if missing:
        raise SystemExit(
            f"Eurostat long files lack required column(s): {', '.join(missing)}"
        )

    # normalize; keep missing keys missing rather than the literal "NAN"
    # that astype(str) would otherwise produce
    for col in ("region", "unit"):
        raw = long[col]
        long[col] = raw.astype(str).str.strip().str.upper().where(raw.notna())
    long["year"] = pd.to_numeric(long["year"], errors="coerce").astype("Int64")
    long["value"] = pd.to_numeric(long["value"], errors="coerce")

    # Drop unusable rows BEFORE de-duplicating so that a duplicate with a
    # missing value can never shadow a later row carrying the real number.
    long = long.dropna(subset=list(required))
    # de-dup (some runs can append duplicates)
    long = long.drop_duplicates(subset=["region", "year", "unit"], keep="first")
    if long.empty:
        raise SystemExit("Eurostat long files contain no usable rows.")

    wide = (
        long.pivot_table(index=["region", "year"], columns="unit", values="value", aggfunc="first")
        .reset_index()
        .rename(columns={"PC_HH": "broadband_pc_hh", "PC_HH_IACC": "internet_access_pc_hh"})
    )

    out_wide = in_dir / "broadband_region_year_eurostat_isoc_r_broad_h.csv"
    wide.to_csv(out_wide, index=False, encoding="utf-8")

    # Fill the project template (preferred schema)
    tmpl = pd.DataFrame(
        {
            "cntry": pd.NA,
            "regunit": pd.NA,
            "region": wide["region"],
            "year": wide["year"],
            # Stays empty when no PC_HH rows were present in the input.
            "fixed_broadband_coverage_pct": wide.get("broadband_pc_hh", pd.NA),
            "mobile_coverage_pct": pd.NA,
            "avg_download_mbps": pd.NA,
            "source": "Eurostat API (isoc_r_broad_h)",
            "notes": "broadband_pc_hh and internet_access_pc_hh available in broadband_region_year_eurostat_isoc_r_broad_h.csv",
        }
    )

    # If any long file has cntry/regunit columns, try to keep them (best effort)
    for col in ("cntry", "regunit"):
        if col in long.columns:
            lookup = (
                long[["region", col]]
                .dropna()
                .drop_duplicates()
                .set_index("region")[col]
                .to_dict()
            )
            tmpl[col] = tmpl["region"].map(lookup)

    out_tmpl = in_dir / "broadband_region_year.csv"
    tmpl.to_csv(out_tmpl, index=False, encoding="utf-8")

    print(f"Wrote: {out_wide} rows={len(wide)}")
    print(f"Wrote: {out_tmpl} rows={len(tmpl)}")


# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

